import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
%matplotlib inline
from warnings import filterwarnings
filterwarnings("ignore")
D:\anaconda files\lib\site-packages\scipy\__init__.py:155: UserWarning: A NumPy version >=1.18.5 and <1.25.0 is required for this version of SciPy (detected version 1.26.4
warnings.warn(f"A NumPy version >={np_minversion} and <{np_maxversion}"
data=pd.read_csv("C:\\Users\\laxma\\Downloads\\placement-dataset.csv")
data
| city | cgpa | iq | placement | |
|---|---|---|---|---|
| 0 | New York | 6.8 | 123.0 | 1 |
| 1 | Los Angeles | 5.9 | 106.0 | 0 |
| 2 | Chicago | NaN | 121.0 | 0 |
| 3 | New York | 7.4 | 132.0 | 1 |
| 4 | Los Angeles | 5.8 | 142.0 | 0 |
| ... | ... | ... | ... | ... |
| 95 | Chicago | 4.3 | 200.0 | 0 |
| 96 | New York | 4.4 | 42.0 | 0 |
| 97 | Los Angeles | 6.7 | 182.0 | 1 |
| 98 | Chicago | 6.3 | 103.0 | 1 |
| 99 | New York | 6.2 | 113.0 | 1 |
100 rows × 4 columns
data.head()
| city | cgpa | iq | placement | |
|---|---|---|---|---|
| 0 | New York | 6.8 | 123.0 | 1 |
| 1 | Los Angeles | 5.9 | 106.0 | 0 |
| 2 | Chicago | NaN | 121.0 | 0 |
| 3 | New York | 7.4 | 132.0 | 1 |
| 4 | Los Angeles | 5.8 | 142.0 | 0 |
data.tail()
| city | cgpa | iq | placement | |
|---|---|---|---|---|
| 95 | Chicago | 4.3 | 200.0 | 0 |
| 96 | New York | 4.4 | 42.0 | 0 |
| 97 | Los Angeles | 6.7 | 182.0 | 1 |
| 98 | Chicago | 6.3 | 103.0 | 1 |
| 99 | New York | 6.2 | 113.0 | 1 |
data.describe()
| cgpa | iq | placement | |
|---|---|---|---|
| count | 92.000000 | 96.000000 | 100.000000 |
| mean | 5.965217 | 117.916667 | 0.460000 |
| std | 1.164911 | 46.913508 | 0.500908 |
| min | 3.300000 | 1.000000 | 0.000000 |
| 25% | 5.000000 | 90.000000 | 0.000000 |
| 50% | 6.000000 | 122.000000 | 0.000000 |
| 75% | 6.825000 | 146.750000 | 1.000000 |
| max | 8.500000 | 233.000000 | 1.000000 |
data.describe
<bound method NDFrame.describe of city cgpa iq placement 0 New York 6.8 123.0 1 1 Los Angeles 5.9 106.0 0 2 Chicago NaN 121.0 0 3 New York 7.4 132.0 1 4 Los Angeles 5.8 142.0 0 .. ... ... ... ... 95 Chicago 4.3 200.0 0 96 New York 4.4 42.0 0 97 Los Angeles 6.7 182.0 1 98 Chicago 6.3 103.0 1 99 New York 6.2 113.0 1 [100 rows x 4 columns]>
data.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 100 entries, 0 to 99 Data columns (total 4 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 city 100 non-null object 1 cgpa 92 non-null float64 2 iq 96 non-null float64 3 placement 100 non-null int64 dtypes: float64(2), int64(1), object(1) memory usage: 3.2+ KB
data.isnull().sum()
city 0 cgpa 8 iq 4 placement 0 dtype: int64
data=data.dropna()
data
| city | cgpa | iq | placement | |
|---|---|---|---|---|
| 0 | New York | 6.8 | 123.0 | 1 |
| 1 | Los Angeles | 5.9 | 106.0 | 0 |
| 3 | New York | 7.4 | 132.0 | 1 |
| 4 | Los Angeles | 5.8 | 142.0 | 0 |
| 5 | Chicago | 7.1 | 48.0 | 1 |
| ... | ... | ... | ... | ... |
| 95 | Chicago | 4.3 | 200.0 | 0 |
| 96 | New York | 4.4 | 42.0 | 0 |
| 97 | Los Angeles | 6.7 | 182.0 | 1 |
| 98 | Chicago | 6.3 | 103.0 | 1 |
| 99 | New York | 6.2 | 113.0 | 1 |
88 rows × 4 columns
data.isnull().sum()
city 0 cgpa 0 iq 0 placement 0 dtype: int64
data.shape
(88, 4)
data.duplicated().sum()
0
data.columns
Index(['city', 'cgpa', 'iq', 'placement'], dtype='object')
#VISUALIZATION
plt.bar(data['placement'],data['cgpa'])
plt.xticks(rotation=90)
plt.show()
fig=px.bar(data,x='iq',y='city',color='city')
fig.show()
plt.figure(figsize=(10,4))
sns.countplot(x='placement', data=data, color='r')
plt.show()
sns.lineplot(x='city', y='placement', data=data).set_title('placement with city')
Text(0.5, 1.0, 'placement with city')
sns.barplot(data['placement'],data['iq'],color='r')
plt.xticks(rotation=90)
plt.show()
plt.figure(figsize=(8, 4))
sns.scatterplot(data=data,x='cgpa', y='city')
plt.title('cgpa and city')
plt.xlabel('cgpa')
plt.ylabel('city')
plt.show()
sns.displot(data["city"])
<seaborn.axisgrid.FacetGrid at 0x20132f70f40>
sns.boxplot(x='cgpa',y='iq',data=data)
plt.xticks(rotation=90)
(array([ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16,
17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33,
34, 35, 36, 37, 38]),
[Text(0, 0, '3.3'),
Text(1, 0, '3.5'),
Text(2, 0, '3.9'),
Text(3, 0, '4.0'),
Text(4, 0, '4.3'),
Text(5, 0, '4.4'),
Text(6, 0, '4.6'),
Text(7, 0, '4.7'),
Text(8, 0, '4.8'),
Text(9, 0, '4.9'),
Text(10, 0, '5.0'),
Text(11, 0, '5.1'),
Text(12, 0, '5.2'),
Text(13, 0, '5.3'),
Text(14, 0, '5.4'),
Text(15, 0, '5.7'),
Text(16, 0, '5.8'),
Text(17, 0, '5.9'),
Text(18, 0, '6.0'),
Text(19, 0, '6.1'),
Text(20, 0, '6.2'),
Text(21, 0, '6.3'),
Text(22, 0, '6.4'),
Text(23, 0, '6.5'),
Text(24, 0, '6.6'),
Text(25, 0, '6.7'),
Text(26, 0, '6.8'),
Text(27, 0, '6.9'),
Text(28, 0, '7.0'),
Text(29, 0, '7.1'),
Text(30, 0, '7.3'),
Text(31, 0, '7.4'),
Text(32, 0, '7.5'),
Text(33, 0, '7.6'),
Text(34, 0, '7.8'),
Text(35, 0, '8.0'),
Text(36, 0, '8.1'),
Text(37, 0, '8.3'),
Text(38, 0, '8.5')])
sns.countplot(data=data, x='cgpa',color='yellowgreen')
plt.xticks(rotation=90)
(array([ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16,
17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33,
34, 35, 36, 37, 38]),
[Text(0, 0, '3.3'),
Text(1, 0, '3.5'),
Text(2, 0, '3.9'),
Text(3, 0, '4.0'),
Text(4, 0, '4.3'),
Text(5, 0, '4.4'),
Text(6, 0, '4.6'),
Text(7, 0, '4.7'),
Text(8, 0, '4.8'),
Text(9, 0, '4.9'),
Text(10, 0, '5.0'),
Text(11, 0, '5.1'),
Text(12, 0, '5.2'),
Text(13, 0, '5.3'),
Text(14, 0, '5.4'),
Text(15, 0, '5.7'),
Text(16, 0, '5.8'),
Text(17, 0, '5.9'),
Text(18, 0, '6.0'),
Text(19, 0, '6.1'),
Text(20, 0, '6.2'),
Text(21, 0, '6.3'),
Text(22, 0, '6.4'),
Text(23, 0, '6.5'),
Text(24, 0, '6.6'),
Text(25, 0, '6.7'),
Text(26, 0, '6.8'),
Text(27, 0, '6.9'),
Text(28, 0, '7.0'),
Text(29, 0, '7.1'),
Text(30, 0, '7.3'),
Text(31, 0, '7.4'),
Text(32, 0, '7.5'),
Text(33, 0, '7.6'),
Text(34, 0, '7.8'),
Text(35, 0, '8.0'),
Text(36, 0, '8.1'),
Text(37, 0, '8.3'),
Text(38, 0, '8.5')])
sns.histplot(data, x="cgpa", hue="placement", multiple="stack", bins = 50, kde=True)
<AxesSubplot:xlabel='cgpa', ylabel='Count'>
#MODEL BUILDING
X = data.drop(['city'], axis=1)
y = data['placement']
X.head()
| cgpa | iq | placement | |
|---|---|---|---|
| 0 | 6.8 | 123.0 | 1 |
| 1 | 5.9 | 106.0 | 0 |
| 3 | 7.4 | 132.0 | 1 |
| 4 | 5.8 | 142.0 | 0 |
| 5 | 7.1 | 48.0 | 1 |
y.head()
0 1 1 0 3 1 4 0 5 1 Name: placement, dtype: int64
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.5, random_state=2)
X_train.shape, X_test.shape
((44, 3), (44, 3))
X_train.dtypes
cgpa float64 iq float64 placement int64 dtype: object
from sklearn.tree import DecisionTreeClassifier
DTree = DecisionTreeClassifier(criterion='gini', max_depth=3, random_state=0)
DTree.fit(X_train, y_test)
DecisionTreeClassifier(max_depth=3, random_state=0)
y_pred = DTree.predict(X_test)
y_pred
array([1, 0, 0, 0, 0, 1, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0,
1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1],
dtype=int64)
from sklearn.metrics import accuracy_score
print('model accuracy score with criterion gini index: {0:04f}'. format (accuracy_score(y_test, y_pred)))
model accuracy score with criterion gini index: 0.318182
from sklearn.metrics import confusion_matrix
conf_matrix = confusion_matrix(y_test, y_pred)
conf_matrix
array([[11, 12],
[18, 3]], dtype=int64)
plt.figure(figsize=(10,8))
sns.heatmap(conf_matrix, annot=True, fmt='d', cmap='crest', cbar=False)
<AxesSubplot:>
from sklearn.metrics import classification_report
class_report = classification_report(y_test, y_pred)
print(class_report)
precision recall f1-score support
0 0.38 0.48 0.42 23
1 0.20 0.14 0.17 21
accuracy 0.32 44
macro avg 0.29 0.31 0.29 44
weighted avg 0.29 0.32 0.30 44
plt.figure(figsize=(8,6))
from sklearn import tree
tree.plot_tree(DTree.fit(X_train, y_train))
[Text(0.5, 0.75, 'X[2] <= 0.5\ngini = 0.499\nsamples = 44\nvalue = [21, 23]'), Text(0.25, 0.25, 'gini = 0.0\nsamples = 21\nvalue = [21, 0]'), Text(0.75, 0.25, 'gini = 0.0\nsamples = 23\nvalue = [0, 23]')]